import scrapy
# Distributed spider (scrapy-redis based)

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider
from hongniang.items import HongniangItem

class HongniangSpider(RedisCrawlSpider):
    """Distributed (scrapy-redis) crawl spider for hongniang.com member profiles.

    The crawl is seeded through Redis instead of ``start_urls``::

        lpush hongniangSpider:start_urls http://www.hongniang.com/match

    Pagination links are followed without a callback; each member profile
    page is parsed by :meth:`parse_item` into a ``HongniangItem``.
    """
    name = 'hongNiang'
    allowed_domains = ['hongniang.com']
    start_urls = ['http://www.hongniang.com']
    # Seed command:
    # lpush hongniangSpider:start_urls http://www.hongniang.com/match
    redis_key = "hongniangSpider:start_urls"

    def __init__(self, *args, **kwargs):
        """Optionally narrow ``allowed_domains`` at runtime.

        Pass ``-a domain=a.com,b.com`` on the command line. When no
        ``domain`` argument is supplied, the class-level default is kept.
        """
        domain = kwargs.pop('domain', '')
        if domain:
            # BUG FIX: filter() returns a lazy iterator in Python 3, which
            # Scrapy's offsite middleware cannot use as a domain list —
            # materialize it. Also only override when a domain was actually
            # given, so the class-level default survives a plain start.
            self.allowed_domains = [d for d in domain.split(',') if d]
        super(HongniangSpider, self).__init__(*args, **kwargs)

    # Pagination links, e.g. http://www.hongniang.com/match?&page=2
    # BUG FIX: '?' and '.' are regex metacharacters. The original pattern
    # r"hongniang.com/match?&page=\d+" made the trailing 'h' optional and
    # never matched the literal '?' in the URL, so no page links were
    # extracted. Escape both.
    page_links = LinkExtractor(allow=(r"hongniang\.com/match\?&page=\d+",))
    # Member profile pages (two historical URL layouts).
    # profile_links = LinkExtractor(allow=(r"hongniang\.com/user/member/id/\d+",))
    profile_links = LinkExtractor(allow=(r"http://www\.hongniang\.com/user/\d/e/\d+\.html",
                                         r"hongniang\.com/user/member/id/\d+"))
    rules = (
        # No callback: CrawlSpider defaults follow=True, so pagination is walked.
        Rule(page_links),
        # Callback defaults follow to False; keep following explicitly.
        Rule(profile_links, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Extract a single member profile page into a ``HongniangItem``.

        :param response: the profile page response matched by ``profile_links``.
        :returns: yields one populated ``HongniangItem``.
        """
        # Debug aid: show which User-Agent fetched this page (useful when
        # rotating UAs through downloader middleware).
        print('aqie', response.request.headers['User-Agent'], '\n')
        item = HongniangItem()
        username = response.xpath("normalize-space(//div[@class='name nickname']/text())").extract_first()
        age = response.xpath(
            "normalize-space(//div[@class='mem_main']/div[@class='sub1']/div[@class='right']/div[@class='info2']/div["
            "1]/ul[1]/li[1]/text())").extract_first()
        # First slider thumbnail is used as the avatar/header image.
        header_link = response.xpath(
            "//div[@class='mem_main']/div[@class='sub1']/div[@class='left']/div[@id='tFocus']/div["
            "@id='tFocusBtn']/div[@id='tFocus-btn']/ul//img/@src").extract_first()
        # All slider thumbnails form the photo gallery.
        images_url = response.xpath(
            "//div[@class='mem_main']/div[@class='sub1']/div[@class='left']/div[@id='tFocus']/div["
            "@id='tFocusBtn']/div[@id='tFocus-btn']/ul//img/@src").extract()
        content = response.xpath(
            "normalize-space(//div[@class='mem_main']/div[@class='sub1']/div[@class='right']/div[@class='info5']/div[@class='text']/text())").extract_first()
        place_from = response.xpath(
            "normalize-space(//div[@class='mem_main']/div[@class='sub2']/div[@class='info1'][1]/div[@class='right']/ul[2]/li[1]/text())").extract_first()
        education = response.xpath(
            "normalize-space(//div[@class='mem_main']/div[@class='sub1']/div[@class='right']/div[@class='info2']/div/ul[2]/li[2]/text())").extract_first()
        hobby = response.xpath(
            "normalize-space(//div[@class='mem_main']//div[@class='sub2']/div[@class='info1'][2]/div[@class='right'][1]/ul[1]/li[4]/text())").extract_first()
        item["username"] = username
        item["age"] = age
        item["header_link"] = header_link
        item["images_url"] = images_url
        item["content"] = content
        item["place_from"] = place_from
        item["education"] = education
        item["hobby"] = hobby
        item["source_url"] = response.url
        # Which site this record was scraped from.
        item["source"] = "hongniang"
        yield item
